# R version: 4.5.0
# Load required libraries
library(factoextra)
library(cluster)

# Start from the built-in USArrests data set
df <- USArrests

# Report the size of the raw data: columns are attributes, rows are instances
cat("Number of attributes:", dim(df)[2L], "\n")
cat("Number of instances:", dim(df)[1L], "\n")

# Drop incomplete rows, then standardise every variable (mean 0, sd 1).
# scale() returns a numeric matrix, so `df` is a matrix from here on.
df <- df |>
  na.omit() |>
  scale()

# Preview the first six standardised rows
head(df, n = 6L)

# Linkage methods to compare; each element is named after itself so that
# results computed over `m` are labelled by method
m <- setNames(nm = c("average", "single", "complete", "ward"))

# Compute the agglomerative coefficient for one linkage method.
#
# Arguments:
#   x    - linkage method name, passed to agnes() as `method`
#          (e.g. "average", "single", "complete", "ward").
#   data - data to cluster. Defaults to the script-level `df`, so existing
#          one-argument calls such as ac("ward") behave exactly as before.
#
# Returns the `ac` component of the fitted agnes object.
ac <- function(x, data = df) {
  agnes(data, method = x)$ac
}

# Calculate the agglomerative coefficient for each linkage method.
# vapply() is used instead of sapply() so the result is always a named numeric
# vector with one value per method; anything else raises an error instead of
# silently changing the return type.
vapply(m, ac, FUN.VALUE = numeric(1))

# Fit agglomerative nesting (AGNES) on the scaled data with Ward linkage
clust <- agnes(x = df, method = "ward")

# Draw the clustering tree; hang = -1 lines all leaf labels up at the bottom
pltree(
  clust,
  main = "Dendrogram",
  hang = -1,
  cex = 0.6
)

# Calculate the gap statistic for k = 1..10 clusters.
# clusGap() compares the observed clustering against B = 50 simulated
# reference data sets, so the RNG is seeded first: without a fixed seed the
# gap curve (and any k chosen from it) can change from run to run.
# NOTE(review): nstart looks like a k-means leftover; it is only forwarded to
# hcut() through `...` - confirm whether hcut() actually uses it.
set.seed(123)
gap_stat <- clusGap(df, FUN = hcut, nstart = 25, K.max = 10, B = 50)

# Produce plot of number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

# Pairwise Euclidean distances between the scaled observations
d <- dist(x = df, method = "euclidean")

# Agglomerative clustering on those distances using Ward linkage ("ward.D2")
final_clust <- hclust(d = d, method = "ward.D2")

# Assign each observation to one of 4 clusters by cutting the tree
groups <- cutree(tree = final_clust, k = 4)

# Cluster sizes
table(groups)

# Append cluster labels to the original (unscaled) data.
# Rows are selected by the observation names carried on `groups`, so labels
# stay aligned with USArrests even if na.omit() dropped incomplete rows before
# clustering (a plain cbind(USArrests, ...) would then fail on row counts).
final_data <- cbind(
  USArrests[names(groups), , drop = FALSE],
  cluster = groups
)

# Display first six rows of final data
head(final_data)

# Mean of every variable within each cluster.
# The formula interface groups by `cluster` without also averaging the cluster
# label itself, which previously produced a second, redundant "cluster" column.
aggregate(. ~ cluster, data = final_data, FUN = mean)


# Choose the number of clusters from the gap statistic computed earlier.
# `gap_stat` is reused rather than recomputed: clusGap() is a Monte Carlo
# procedure, so a second run could disagree with the gap plot already shown
# (and repeats an expensive computation for no benefit).
# Row i of gap_stat$Tab holds the result for k = i clusters, so the index of
# the largest gap is the number of clusters.
optimal_clusters <- which.max(gap_stat$Tab[, "gap"])

# Cut the Ward tree fitted earlier (`final_clust`) at the optimal number of
# clusters; refitting hclust() on the same distances would give the same tree.
groups <- cutree(final_clust, k = optimal_clusters)

# Append the new cluster labels to the original (unscaled) data.
# Rows are matched by observation name so labels stay aligned with USArrests
# even if incomplete rows were dropped before clustering.
final_data <- cbind(
  USArrests[names(groups), , drop = FALSE],
  cluster = groups
)

# Mean of every variable within each new cluster. The formula interface avoids
# averaging the cluster label itself into a duplicate "cluster" column.
aggregate(. ~ cluster, data = final_data, FUN = mean)
